# --------------------------------------------------------
# ' clean political advertising data
# --------------------------------------------------------
library(rio)
library(data.table)
library(car)
library(maps)

# specify your main directory here
here = '~/Dropbox/project-archive/BG-bk/dataverse'

# read all data
data2000 = data.table(import(file.path(here,'data','rawdata_political_ad','Political Advertising in 2000.dta')))

data2008 = data.table(import(file.path(here,'data','rawdata_political_ad','WiscAds2008_Presidential.dta')))
data2008_gsh = data.table(import(file.path(here,'data','rawdata_political_ad','WiscAds2008_GSHData.dta')))

house2016 = data.table(import(file.path(here,'data','rawdata_political_ad','wmp-house-2016-v1.0.dta')))
senate2016 = data.table(import(file.path(here,'data','rawdata_political_ad','wmp-senate-2016-v1.0.dta')))
gov2016 = data.table(import(file.path(here,'data','rawdata_political_ad','wmp-gov-2016-v1.0.dta')))

#------------------------------------------------------------------------------
# 2000
#==============================================================================
# drop when statcode is missing
data2000 = data2000[!is.na(statcode),]

# process date 
data2000[,year := year(spotdate)]
data2000[,month := month(spotdate)]

#q14. In your judgment, is the primary purpose of the ad to promote a specific candidate ("In his distinguished career, Senator Jones has brought millions of dollars home. We need Senator Jones"), to attack a candidate ("In his long years in Washington, Senator Jones has raised your taxes over and over. We can't afford 6 more years of Jones.") or to contrast the candidates ("While Senator Jones has been raising your taxes, Representative Smith has been cutting them.")?
#0 'Not applicable'
#1 'Attack'
#2 'Contrast'
#3 'Promote'
#4 'Unsure or Unclear'
data2000[q14==0, q14 := NA]
data2000[,tone_attack   := ifelse(q14 == 1, 1, 0)]
data2000[,tone_contrast := ifelse(q14 == 2, 1, 0)]
data2000[,tone_promote  := ifelse(q14 == 3, 1, 0)]
data2000[,tone_unclear  := ifelse(q14 == 4, 1, 0)]

#q5. What is the party of the favored candidate? 
#1 'Democrat'
#2 'Republican'
#3 'Other'
data2000[q5 == 9, q5 := NA]
data2000[, democrat := ifelse(q5 == 1, 1, 0)]
data2000[, republican := ifelse(q5 == 2, 1, 0)]
data2000[, other := ifelse(q5 == 3, 1, 0)]

ad2000 = data2000[,.(
	n_ad = .N, 
	sum_tone_attack   = sum(tone_attack   ,na.rm=TRUE),
	sum_tone_contrast = sum(tone_contrast ,na.rm=TRUE),
	sum_tone_promote  = sum(tone_promote  ,na.rm=TRUE),
	sum_tone_unclear  = sum(tone_unclear  ,na.rm=TRUE),

	sum_democrat = sum(democrat, na.rm=TRUE),
	sum_republican = sum(republican, na.rm=TRUE),
	sum_other = sum(other, na.rm=TRUE),
	sum_cost=sum(cost,na.rm=TRUE)
	), by=c('statcode','spotdate')]

# process state information
state_code = '("AL" =1) ("AK" = 2) ("AZ" =4) ("AR" =5) ("CA" =6) ("CO"=8) ("CT" =9) ("DE" =10) ("DC" =11) ("FL" =12) ("GA" =13) ("HI"=15) ("ID" = 16) ("IL" =17) ("IN" =18) ("IA"=19) ("KS" =20) ("KY" =21) ("LA"=22) ("ME" =23) ("MD" =24) ("MA" = 25) ("MI" = 26) ("MN" =27) ("MS"=28) ("MO" = 29) ("NE" = 31) ("NV" =32) ("NH" = 33) ("NJ" = 34) ("NM" = 35) ("NY" =36) ("NC" = 37) ("ND"=38) ("OH" =39) ("OK" =40) ("OR"=41) ("PA" =42) ("RI" =44) ("SC" =45) ("SD"=46) ("TN"=47) ("TX" =48) ("UT" =49) ("VT"=50) ("VA" =51) ("WA" =53) ("WV" =54) ("WI" =55) ("US" =99)'
state_code = lapply(strsplit(state_code,'\\)')[[1]],function(x) x[1])
list_code = list()
for (i in 1:length(state_code)){
	tmp = state_code[i][[1]]
	st = gsub('[^[:alnum:]]','',strsplit(gsub('\\(','',tmp),'=')[[1]][1])
	no = strsplit(gsub('\\(','',tmp),'=')[[1]][2]
	list_code[[i]] =data.table(state=st,statcode=as.integer(no))
}
list_code = rbindlist(list_code)

# merge with state code data
ad2000 = merge(x=ad2000,y=list_code,by='statcode',all.x=TRUE)
ad2000[,statcode := NULL]
setnames(ad2000,'spotdate','date')
setorder(ad2000,'state','date')

# drop when date is missing
ad2000 = ad2000[!is.na(date),]

# generate daily panels 
min_max_date = ad2000[,.(min_date = min(date), max_date=max(date)),by='state']
min_max_date[,n_date := as.numeric(max_date-min_date)+1]
all_ad2000 = min_max_date[rep(1:.N,n_date)][,index := 1:.N, by='state']
all_ad2000[,date := min_date + index-1]

all_ad2000 = merge(x=all_ad2000, y=ad2000,by=c('state','date'),all.x=TRUE)

# assign zero when there's no ad aired for particulate dates
var_list = c('n_ad','sum_tone_attack','sum_tone_contrast','sum_tone_promote','sum_tone_unclear',
	'sum_democrat','sum_republican','sum_other','sum_cost')
for (var in var_list){
	all_ad2000[is.na(get(var)),(var) := 0]	
}

# generate culumative sum for each state across different date
setorder(all_ad2000, state, date)
for (var in var_list){
	all_ad2000[,(paste0('cumsum_',var)) := cumsum(get(var)), by=c('state')]	
}

#------------------------------------------------------------------------------
# 2008
# combine presidential ad data wigh GSH data
market_state_2008 = unique(data2008[,c('market','STATE_1')])
market_state_2008 = market_state_2008[STATE_1 != '',]
market_state_2008 = market_state_2008[market != 'Springfield, M',]

data2008_gsh = merge(x=data2008_gsh, y=market_state_2008,by=c('market'),all.x=TRUE)
data2008 = rbind(data2008,data2008_gsh,fill=TRUE)
#==============================================================================

#AD_TONE In your judgment, is the primary purpose of the ad to promote a specific candidate, attack a candidate, or contrast the candidates?
#1 Contrast
#2 Promote
#3 Attack
#98 DK
#99 NA
data2008[AD_TONE %in% c(99), AD_TONE := NA]
data2008[,tone_attack   := ifelse(AD_TONE == 3, 1, 0)]
data2008[,tone_contrast := ifelse(AD_TONE == 1, 1, 0)]
data2008[,tone_promote  := ifelse(AD_TONE == 2, 1, 0)]
data2008[,tone_unclear  := ifelse(AD_TONE == 98, 1, 0)]

#PARTY Party of favored candidate
#1 Democrat
#2 Republican
#3 Independent
#4 Green
#5 Libertarian
#90 other
#98 DK
#99 Missing
data2008[party %in% c(98,99), party := NA]
data2008[, democrat := ifelse(party == 1, 1, 0)]
data2008[, republican := ifelse(party == 2, 1, 0)]

# specify date; note here the original days_to measure is +/- from the election date in 2008
data2008[,date := as.Date('2008-11-04',format='%Y-%m-%d')-Days_To]
data2008[,year := year(date)]
data2008[,month := month(date)]

ad2008 = data2008[,.(
	n_ad = .N, 
	sum_tone_attack   = sum(tone_attack   ,na.rm=TRUE),
	sum_tone_contrast = sum(tone_contrast ,na.rm=TRUE),
	sum_tone_promote  = sum(tone_promote  ,na.rm=TRUE),
	sum_tone_unclear  = sum(tone_unclear  ,na.rm=TRUE),

	sum_democrat = sum(democrat, na.rm=TRUE),
	sum_republican = sum(republican, na.rm=TRUE),
	sum_cost=sum(EST_SPENDING,na.rm=TRUE)
	), by=c('STATE_1','date')]

# drop when state or date is missing
ad2008 = ad2008[STATE_1 != '',]
ad2008 = ad2008[!is.na(date),]

# clean state indicator 
ad2008[,state := STATE_1]
ad2008[STATE_1 == 'NATIONAL',state := 'US']
ad2008[,STATE_1 := NULL]

setorder(ad2008, 'state','date')

# generate daily panels 
min_max_date = ad2008[,.(min_date = min(date), max_date=max(date)),by='state']
min_max_date[,n_date := as.numeric(max_date-min_date)+1]
all_ad2008 = min_max_date[rep(1:.N,n_date)][,index := 1:.N, by='state']
all_ad2008[,date := min_date + index-1]
all_ad2008 = merge(x=all_ad2008, y=ad2008,by=c('state','date'),all.x=TRUE)

# assign zero when there's no ad aired for particulate dates
var_list = c('n_ad','sum_tone_attack','sum_tone_contrast','sum_tone_promote','sum_tone_unclear',
	'sum_democrat','sum_republican','sum_cost')
for (var in var_list){
	all_ad2008[is.na(get(var)),(var) := 0]	
}

# cumsum each measure over time
setorder(all_ad2008, state, date)
for (var in var_list){
	all_ad2008[,(paste0('cumsum_',var)) := cumsum(get(var)), by=c('state')]	
}

#------------------------------------------------------------------------------
# 20016
#==============================================================================

# process date information
house2016[,year := year(airdate)]
house2016[,month := month(airdate)]

senate2016[,year := year(airdate)]
senate2016[,month := month(airdate)]

gov2016[,year := year(airdate)]
gov2016[,month := month(airdate)]

house2016[,election_type := 'house']
senate2016[,election_type := 'senate']
gov2016[,election_type := 'gov']

# combine different data
data2016 = rbind(house2016,senate2016,gov2016)

# remove data from environments that are not used anymore
rm(house2016, senate2016, gov2016)

#ad_tone: In your judgment, is the primary purpose of the ad to promote a specific candidate, attack a candidate, or contrast the candidates?
#1 Contrast 2 Promote 3 Attack
data2016[,tone_attack   := ifelse(ad_tone == 3, 1, 0)]
data2016[,tone_contrast := ifelse(ad_tone == 1, 1, 0)]
data2016[,tone_promote  := ifelse(ad_tone == 2, 1, 0)]

#PARTY Party of favored candidate
#1 Democrat
#2 Republican
#3 Independent
#4 Green
#5 Libertarian
#90 other
#98 DK
#99 Missing

data2016[, democrat := ifelse(party == 'DEMOCRAT', 1, 0)]
data2016[, republican := ifelse(party == 'REPUBLICAN', 1, 0)]
data2016[, other := ifelse(party == 'OTHER' | party == 'LIBERTARIAN', 1, 0)]

ad2016 = data2016[,.(
	n_ad = .N, 
	sum_tone_attack   = sum(tone_attack   ,na.rm=TRUE),
	sum_tone_contrast = sum(tone_contrast ,na.rm=TRUE),
	sum_tone_promote  = sum(tone_promote  ,na.rm=TRUE),
	sum_democrat = sum(democrat, na.rm=TRUE),
	sum_republican = sum(republican, na.rm=TRUE),
	sum_other = sum(other, na.rm=TRUE),
	sum_cost=sum(est_cost,na.rm=TRUE)
	), by=c('state','airdate')]

# process state/date information
ad2016[state=='National Cable',state := 'US']

setnames(ad2016,'airdate','date')
ad2016 = ad2016[!is.na(date),]

# create daily panel
min_max_date = ad2016[,.(min_date = min(date), max_date=max(date)),by='state']
min_max_date[,n_date := as.numeric(max_date-min_date)+1]
all_ad2016 = min_max_date[rep(1:.N,n_date)][,index := 1:.N, by='state']
all_ad2016[,date := min_date + index-1]

all_ad2016 = merge(x=all_ad2016, y=ad2016,by=c('state','date'),all.x=TRUE)

# assign zero when there's no ad aired for particulate dates
var_list = c('n_ad','sum_tone_attack','sum_tone_contrast','sum_tone_promote',
	'sum_democrat','sum_republican','sum_other','sum_cost')
for (var in var_list){
	all_ad2016[is.na(get(var)),(var) := 0]	
}

# cumsum each measure over time
setorder(all_ad2016, state, date)
for (var in var_list){
	all_ad2016[,(paste0('cumsum_',var)) := cumsum(get(var)), by=c('state')]	
}

#------------------------------------------------------------------------------
# aggeraget data per election
all_ad2000[, year := 2000]
year_ad2000 = all_ad2000[,.(
	n_ad = sum(n_ad,na.rm=TRUE), 
	sum_tone_attack   = sum(sum_tone_attack   ,na.rm=TRUE),
	sum_tone_contrast = sum(sum_tone_contrast ,na.rm=TRUE),
	sum_tone_promote  = sum(sum_tone_promote  ,na.rm=TRUE),
	sum_tone_unclear  = sum(sum_tone_unclear  ,na.rm=TRUE),
	sum_democrat = sum(sum_democrat, na.rm=TRUE),
	sum_republican = sum(sum_republican, na.rm=TRUE),
	sum_other = sum(sum_other, na.rm=TRUE),	
	sum_cost = sum(sum_cost,na.rm=TRUE)
	),by=c('state','year')]

all_ad2008[, year := 2008]
year_ad2008 = all_ad2008[,.(
	n_ad = sum(n_ad,na.rm=TRUE), 
	sum_tone_attack   = sum(sum_tone_attack   ,na.rm=TRUE),
	sum_tone_contrast = sum(sum_tone_contrast ,na.rm=TRUE),
	sum_tone_promote  = sum(sum_tone_promote  ,na.rm=TRUE),
	sum_tone_unclear  = sum(sum_tone_unclear  ,na.rm=TRUE),
	sum_democrat = sum(sum_democrat, na.rm=TRUE),
	sum_republican = sum(sum_republican, na.rm=TRUE),
	sum_cost = sum(sum_cost,na.rm=TRUE)
	),by=c('state','year')]

all_ad2016[, year := 2016]
year_ad2016 = all_ad2016[,.(
	n_ad = sum(n_ad,na.rm=TRUE), 
	sum_tone_attack   = sum(sum_tone_attack   ,na.rm=TRUE),
	sum_tone_contrast = sum(sum_tone_contrast ,na.rm=TRUE),
	sum_tone_promote  = sum(sum_tone_promote  ,na.rm=TRUE),
	sum_democrat = sum(sum_democrat, na.rm=TRUE),
	sum_republican = sum(sum_republican, na.rm=TRUE),
	sum_other = sum(sum_other, na.rm=TRUE),	
	sum_cost = sum(sum_cost,na.rm=TRUE)
	),by=c('state','year')]

ad_year = rbind(year_ad2000,year_ad2008,year_ad2016,fill=TRUE)
export(ad_year, file.path(here,'data','processed','political_ad_year.dta'))
#==============================================================================

#------------------------------------------------------------------------------
# date-by-date aggregation
ad_all = rbind(all_ad2000,all_ad2008,all_ad2016,fill=TRUE)
ad_all = ad_all[,c('state','date','n_ad',grep('cumsum_|sum_',names(ad_all),value=TRUE)),with=FALSE]
names(ad_all) = gsub('cumsum_','cs_',names(ad_all))

export(ad_all, file.path(here,'data','processed','political_ad_daily.dta'))
#==============================================================================





